from pyspark.sql import SparkSessionimport pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import monotonically_increasing_idnp.random.seed(42)pio.renderers.default ="vscode+notebook+png"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")# Show Schema and Sample Data# print("---This is Diagnostic check, No need to print it in the final doc---")# df.printSchema() # comment this line when rendering the submission# df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 02:26:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/22 02:26:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
[Stage 1:> (0 + 1) / 1]
3 Salary Distribution by Industry and Employment Type
Compare salary variations across industries.
Filter the dataset
Remove records where salary is missing or zero.
Aggregate Data
Group by NAICS industry codes.
Group by employment type and compute salary distribution.
Visualize results
Create a box plot where:
X-axis = NAICS2_NAME
Y-axis = SALARY_FROM
Group by EMPLOYMENT_TYPE_NAME.
Customize colors, fonts, and styles.
3.1 Salary Distribution by Industry
Explanation: This graph reveals that the Information industry offers the highest median salaries (around $140K), while most other industries cluster between $100K and $120K with relatively similar distributions. It also highlights that all industries exhibit substantial salary ranges and numerous outliers, with some positions reaching $400Kâ$500K, indicating significant intra-industry variation in compensation.
# Filtering out missing or zero salary valuespdf = df.filter(df["SALARY"] >0).select("NAICS2_NAME", "SALARY").toPandas()# Cleaning industry namespdf["NAICS2_NAME"] = ( pdf["NAICS2_NAME"] .fillna("") .apply(lambda x: re.sub(r"[^\x00-\x7F]+", "", x).strip()))pdf = pdf[pdf["NAICS2_NAME"].str.len() >0]# Converting salary to $1000 unitspdf["SALARY"] = pdf["SALARY"] /1000# Computing median salary by industry for sortingmedian_salaries = pdf.groupby("NAICS2_NAME")["SALARY"].median()sorted_industries = median_salaries.sort_values(ascending=False).index# Applying sorted categoriespdf["NAICS2_NAME"] = pd.Categorical( pdf["NAICS2_NAME"], categories=sorted_industries, ordered=True)# Creating box plotfig = px.box( pdf, x="NAICS2_NAME", y="SALARY", points="outliers", title="Salary Distribution by Industry", labels={"NAICS2_NAME": "Industry","SALARY": "Salary (in $1000)" }, color_discrete_sequence=["#eb6864"], height=600)fig.update_layout( font=dict(family="Arial", size=12, color="#333333"), plot_bgcolor="white", paper_bgcolor="white", title=dict(x=0.5, xanchor="center", font=dict(size=18)), xaxis=dict( tickangle=45, tickfont=dict(size=10), showgrid=False, zeroline=False, linecolor='black', ticks='outside', showline=True, mirror=True ), yaxis=dict( tick0=0, dtick=100, showgrid=True, gridcolor='lightgray', zeroline=False, linecolor='black', ticks='outside', showline=True, mirror=True ), margin=dict(l=60, r=40, t=80, b=200), boxmode="group", hovermode="x unified")fig.show()fig.write_image("output/Q2_Industry_BoxPlot.svg", width=3500, height=600, scale=1)
[Stage 6:> (0 + 1) / 1]
3.2 Salary Distribution by Employment Type
Explanation: The box plot reveals that full-time employees tend to earn higher median salaries compared to part-time and mixed employment types. However, full-time roles also exhibit a wider salary range and more extreme outliers, indicating greater variability in compensation.
# Filtering out missing or zero salary valuespdf = df.filter(df["SALARY"]>0).select("EMPLOYMENT_TYPE_NAME","SALARY").toPandas()# Cleaning employment type names for better readabilitypdf["EMPLOYMENT_TYPE_NAME"] = ( pdf["EMPLOYMENT_TYPE_NAME"] .fillna("") .apply(lambda x: re.sub(r"[^\x00-\x7F]+", "", x).strip()))pdf = pdf[pdf["EMPLOYMENT_TYPE_NAME"].str.len() >0]# Converting salary to $1000 unitspdf["SALARY"] = pdf["SALARY"] /1000# Computing media salary for sortingmedian_salaries = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()# Sorting employment types based on median salarysorted_employment_types = median_salaries.sort_values(ascending=False).index# Applying sorted categoriespdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical( pdf["EMPLOYMENT_TYPE_NAME"], categories=sorted_employment_types, ordered=True)# Creating box plotfig = px.box( pdf, x="EMPLOYMENT_TYPE_NAME", y="SALARY", points="outliers", title="Salary Distribution by Employment Type", labels={"EMPLOYMENT_TYPE_NAME": "Employment Type","SALARY": "Salary (in $1000)" }, color_discrete_sequence=["#eb6864"], height=500)fig.update_layout( font=dict(family="Arial", size=14, color="#333333"), plot_bgcolor="white", paper_bgcolor="white", title=dict(x=0.5, xanchor="center", font=dict(size=18)), xaxis=dict( showgrid=False, zeroline=False, linecolor='black', ticks='outside', showline=True, mirror=True ), yaxis=dict( showgrid=True, gridcolor='lightgray', zeroline=False, linecolor='black', ticks='outside', showline=True, mirror=True, tick0=0, dtick=50 ), margin=dict(l=60, r=40, t=80, b=60), boxmode="group", hovermode="x unified")fig.show()fig.write_image("output/Q1_EMPLOYMENT_TYPE_BoxPlot.svg", width=3000, height=500, scale=1)
[Stage 7:> (0 + 1) / 1]
4 Salary Analysis by ONET Occupation Type (Bubble Chart)
Analyze how salaries differ across ONET occupation types.
Aggregate Data
Compute median salary for each occupation in the ONET taxonomy.
Visualize results
Create a bubble chart where:
X-axis = ONET_NAME
Y-axis = Median Salary
Size = Number of job postings
Apply custom colors and font styles.
# Spark SQL - Median salary and job count per LOT_OCCUPATION_NAMEdf.createOrReplaceTempView("Job_Postings")salary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS Occupation_Name, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM Job_Postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")# Converting to Pandas Data Framesalary_pd = salary_analysis.toPandas()# Converting salary to $1000 unitssalary_pd["Median_Salary"] = salary_pd["Median_Salary"] /1000# Creating Bubble Chartcustom_coral_scale = [ [0.0, "#f79a96"], [0.25, "#eb6864"], [0.5, "#c6524f"], [0.75, "#a63b39"], [1.0, "#7c2a29"]]fig = px.scatter( salary_pd, x="Occupation_Name", y="Median_Salary", size="Job_Postings", color="Job_Postings", color_continuous_scale=custom_coral_scale, title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"Occupation_Name": "Occupation Type","Median_Salary": "Median Salary (in $1000)","Job_Postings": "Number of Job Postings" }, hover_name="Occupation_Name", width=1000, height=500, size_max=40)fig.update_layout( font=dict(family="Arial", size=14, color="#333333"), title=dict(x=0.5, xanchor="center", font=dict(size=18)), plot_bgcolor="white", paper_bgcolor="white", xaxis=dict( tickangle=45, tickfont=dict(size=11), showline=True, linecolor='black', title_standoff=10, showgrid=False, zeroline=False, ticks='outside', mirror=True ), yaxis=dict( tick0=0, dtick=25,range=[50, 180], showline=True, linecolor='black', title_standoff=10, showgrid=True, gridcolor='lightgray', zeroline=False, ticks='outside', mirror=True ), coloraxis_colorbar=dict( tickvals=[5000, 10000, 15000, 20000, 25000, 30000], ticktext=["5k", "10k", "15k", "20k", "25k", "30k"], title="Number of Job Postings" ), margin=dict(l=60, r=40, t=80, b=140), hovermode="closest")fig.show()fig.write_image("output/Q3_BubbleChart_V1.svg", width=1500, height=600, scale=1)
25/09/22 02:27:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 8:> (0 + 1) / 1]
Explanation: The bubble chart reveals that while job posting volumes vary widely across LOT occupation types, the median salaries appear uniformly clustered. This pattern is likely due to an earlier step where missing salary values were imputed with the overall median, flattening natural variation. Therefore, we take a further step to filter out the most common imputed value to improve accuracy.
# Spark SQL - Median salary and job count per LOT_OCCUPATION_NAMEdf.createOrReplaceTempView("Job_Postings")salary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS Occupation_Name, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM Job_Postings WHERE SALARY IS NOT NULL AND SALARY > 0 AND SALARY NOT IN (115024.0) AND LOT_OCCUPATION_NAME IS NOT NULL GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")# Converting to Pandas Data Framesalary_pd = salary_analysis.toPandas()# Converting salary to $1000 unitssalary_pd["Median_Salary"] = salary_pd["Median_Salary"] /1000# Creating Bubble Chartcustom_coral_scale = [ [0.0, "#f79a96"], [0.25, "#eb6864"], [0.5, "#c6524f"], [0.75, "#a63b39"], [1.0, "#7c2a29"]]fig = px.scatter( salary_pd, x="Occupation_Name", y="Median_Salary", size="Job_Postings", color="Job_Postings", color_continuous_scale=custom_coral_scale, title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"Occupation_Name": "Occupation Type","Median_Salary": "Median Salary (in $1000)","Job_Postings": "Number of Job Postings" }, hover_name="Occupation_Name", width=1000, height=500, size_max=40)fig.update_layout( font=dict(family="Arial", size=14, color="#333333"), title=dict(x=0.5, xanchor="center", font=dict(size=18)), plot_bgcolor="white", paper_bgcolor="white", xaxis=dict( tickangle=45, tickfont=dict(size=11), showline=True, linecolor='black', title_standoff=10, showgrid=False, zeroline=False, ticks='outside', mirror=True ), yaxis=dict( tick0=0, dtick=25,range=[50, 180], showline=True, linecolor='black', title_standoff=10, showgrid=True, gridcolor='lightgray', zeroline=False, ticks='outside', mirror=True ), coloraxis_colorbar=dict( tickvals=[5000, 10000, 15000, 20000, 25000, 30000], ticktext=["5k", "10k", "15k", "20k", "25k", "30k"], title="Number of Job Postings" ), margin=dict(l=60, r=40, t=80, b=140), hovermode="closest")fig.show()fig.write_image("output/Q3_BubbleChart_V2.svg", width=1500, height=600, scale=1)
[Stage 11:> (0 + 1) / 1]
Explanation: The bubble chart shows that job posting volumes vary significantly across LOT occupation types, with roles like Data/Data Mining Analyst and Business Intelligence Analyst dominating in demand. Median salaries, however, differ more clearly after filtering out previously imputed values,revealing that roles in system architecture and engineering tend to command higher median pay.
5 Salary by Education Level
Create four groups:
Associateâs or lower (GED, Associate, No Education Listed)
Bachelorâs (Bachelorâs degree)
Masterâs (Masterâs degree)
PhD (PhD, Doctorate, professional degree)
Plot scatter plots for each group using, MAX_YEARS_EXPERIENCE (with jitter), Average_Salary, LOT_V6_SPECIALIZED_OCCUPATION_NAME
Short explanation of key insights for each graph:
Associate or Lower: Most salaries cluster between $50Kâ$150K. A few outliers exceed $300Kâ$700K, but they are extremely rare and likely exceptional cases. Increasing years of experience does not significantly improve salary for this group, indicating a potential ceiling for career growth without further education.
Bachelorâs: There is a noticeable upward trendâsalaries increase with experience more consistently than the Associate group. Most salaries fall within $70Kâ$150K, with the median visibly higher than the Associate group. Broader range of occupations are represented, potentially offering better career mobility.
Masterâs: Few roles pay below $100K, indicating a stronger starting point compared to the lower education groups. Despite the smaller sample size, salaries for experienced professionals (5+ years) often exceed $150K. More linear progression, suggesting Masterâs degree offers good return on investment for salary advancement.
PhD: Limited sample, but high value. Even with low years of experience, this group commands higher salariesâlikely due to specialized roles or niche expertise. This suggests deep specialization can offset lack of experience when entering the job market.
from pyspark.sql.functions import col, when, rand# Defining education level groupingsassociate_or_lower = ["GED", "Associate", "No Education Listed", "High school"]bachelor = ["Bachelor"]master = ["Master"]phd = ["Ph.D", "Doctorate", "professional degree"]# Adding EDU_GROUP columndf = df.withColumn("EDU_GROUP", when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in associate_or_lower])), "Associate or Lower") .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in bachelor])), "Bachelorâs") .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in master])), "Masterâs") .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in phd])), "PhD") .otherwise("Other"))# Casting necessary columns to floatdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Adding jitter to avoid overlapping dots in scatter plotdf = df.withColumn("Jittered_Experience", col("MAX_YEARS_EXPERIENCE") + (rand() -0.5))# Filtering for non-null and positive valuesdf = df.filter( col("MAX_YEARS_EXPERIENCE").isNotNull() & col("Average_Salary").isNotNull() & (col("MAX_YEARS_EXPERIENCE") >0) & (col("Average_Salary") >0))# Keeping only four major groupsdf_filtered = df.filter( col("EDU_GROUP").isin("Associate or Lower", "Bachelorâs", "Masterâs", "PhD"))# Converting to Pandas for plottingdf_pd = df_filtered.select("EDU_GROUP","Jittered_Experience","Average_Salary","LOT_V6_SPECIALIZED_OCCUPATION_NAME").toPandas()# Creating scatter plots for each groupedu_groups = ["Associate or Lower", "Bachelorâs", "Masterâs", "PhD"]for group in edu_groups: subset = df_pd[df_pd["EDU_GROUP"] == group].copy() subset["Salary_K"] = subset["Average_Salary"] /1000 fig = px.scatter( subset, x="Jittered_Experience", y="Salary_K", color="LOT_V6_SPECIALIZED_OCCUPATION_NAME", title=f"Experience vs Salary â {group}", labels={"Jittered_Experience": "Years of Experience","Salary_K": "Average Salary (in $1000)","LOT_V6_SPECIALIZED_OCCUPATION_NAME": "Occupation" }, opacity=0.7, width=1000, height=600 ) fig.update_layout( font=dict(family="Arial", size=14, color="#333333"), title=dict(x=0.5, xanchor="center", font=dict(size=18)), plot_bgcolor="white", paper_bgcolor="white", xaxis=dict( tickangle=0, tickfont=dict(size=12), showline=True, linecolor='black', title_standoff=10, showgrid=False, zeroline=False, ticks='outside', mirror=True ), yaxis=dict( title="Average Salary (in $1000)", tick0=0, dtick=25, showline=True, linecolor='black', title_standoff=10, showgrid=True, gridcolor='lightgray', zeroline=False, ticks='outside', mirror=True ), margin=dict(l=60, r=40, t=80, b=60), legend_title_text="Occupation", hovermode="closest" ) fig.show() safe_group = group.replace("â", "").replace(" ", "_") fig.write_image(f"output/Q4_experience_salary_{safe_group}.svg", width=1500, height=600, scale=1)
[Stage 14:> (0 + 1) / 1]
6 Salary by Remote Work Type
Split into three groups based on REMOTE_TYPE_NAME:
Remote
Hybrid
Onsite (includes [None] and blank)
Plot scatter plots for each group using, MAX_YEARS_EXPERIENCE (with jitter), Average_Salary, LOT_V6_SPECIALIZED_OCCUPATION_NAME
Also, create salary histograms for all three groups.
After each graph, briefly describe any patterns or comparisons.